{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "# %%shell\n",
        "# # Installs the latest dev build of TVM from PyPI. If you wish to build\n",
        "# # from source, see https://tvm.apache.org/docs/install/from_source.html\n",
        "# pip install apache-tvm --pre"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "# 在 Relay 中使用管道执行器\n",
        "\n",
        "**原作者**: [Hua Jiang](https://github.com/huajsj)\n",
        "\n",
        "这是关于如何在 Relay 中使用“管道执行器”（Pipeline Executor）的简短教程。"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 2,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "import tvm\n",
        "from tvm import te\n",
        "import numpy as np\n",
        "from tvm.contrib import graph_executor as runtime\n",
        "from tvm.relay.op.contrib.cutlass import partition_for_cutlass\n",
        "from tvm import relay\n",
        "from tvm.relay import testing\n",
        "from tvm.contrib.cutlass import finalize_modules\n",
        "\n",
        "img_size = 8"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 创建简单网络，可以是预训练的模型\n",
        "\n",
        "创建非常简单的网络进行演示。它由卷积、batch normalization、dense 和 ReLU 激活组成。"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 3,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "def get_network():\n",
        "    out_channels = 16\n",
        "    batch_size = 1\n",
        "    data = relay.var(\"data\", relay.TensorType((batch_size, 3, img_size, img_size), \"float16\"))\n",
        "    dense_weight = relay.var(\n",
        "        \"dweight\", relay.TensorType((batch_size, 16 * img_size * img_size), \"float16\")\n",
        "    )\n",
        "    weight = relay.var(\"weight\")\n",
        "    bn_gamma = relay.var(\"bn_gamma\")\n",
        "    bn_beta = relay.var(\"bn_beta\")\n",
        "    bn_mmean = relay.var(\"bn_mean\")\n",
        "    bn_mvar = relay.var(\"bn_var\")\n",
        "    simple_net = relay.nn.conv2d(\n",
        "        data=data, weight=weight, kernel_size=(3, 3), channels=out_channels, padding=(1, 1)\n",
        "    )\n",
        "    simple_net = relay.nn.batch_norm(simple_net, bn_gamma, bn_beta, bn_mmean, bn_mvar)[0]\n",
        "    simple_net = relay.nn.relu(simple_net)\n",
        "    simple_net = relay.nn.batch_flatten(simple_net)\n",
        "    simple_net = relay.nn.dense(simple_net, dense_weight)\n",
        "    simple_net = relay.Function(relay.analysis.free_vars(simple_net), simple_net)\n",
        "    data_shape = (batch_size, 3, img_size, img_size)\n",
        "    net, params = testing.create_workload(simple_net)\n",
        "    return net, params, data_shape\n",
        "\n",
        "\n",
        "net, params, data_shape = get_network()"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 将网络分成两个子图\n",
        "\n",
        "单元测试中的 'graph_split' 函数只是一个例子。用户可以创建自定义的逻辑来分割计算图。"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 5,
      "metadata": {},
      "outputs": [],
      "source": [
        "import inspect\n",
        "import os\n",
        "tutorial_dir = os.path.dirname(inspect.getfile(lambda: None))\n",
        "os.sys.path.append(os.path.join(tutorial_dir, \"../../../tests/python/relay\"))\n",
        "from test_pipeline_executor import graph_split"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "将网络分成两个子图。"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 6,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "split_config = [{\"op_name\": \"nn.relu\", \"op_index\": 0}]\n",
        "subgraphs = graph_split(net[\"main\"], split_config, params)"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "生成的子图应该如下所示。"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 7,
      "metadata": {},
      "outputs": [
        {
          "data": {
            "text/plain": [
              "[def @main(%data: Tensor[(1, 3, 8, 8), float16] /* ty=Tensor[(1, 3, 8, 8), float16] */) {\n",
              "   %0 = nn.conv2d(%data, meta[relay.Constant][0] /* ty=Tensor[(16, 3, 3, 3), float16] */, padding=[1, 1, 1, 1], channels=16, kernel_size=[3, 3]) /* ty=Tensor[(1, 16, 8, 8), float16] */;\n",
              "   %1 = nn.batch_norm(%0, meta[relay.Constant][1] /* ty=Tensor[(16), float16] */, meta[relay.Constant][2] /* ty=Tensor[(16), float16] */, meta[relay.Constant][3] /* ty=Tensor[(16), float16] */, meta[relay.Constant][4] /* ty=Tensor[(16), float16] */) /* ty=(Tensor[(1, 16, 8, 8), float16], Tensor[(16), float16], Tensor[(16), float16]) */;\n",
              "   %2 = %1.0 /* ty=Tensor[(1, 16, 8, 8), float16] */;\n",
              "   nn.relu(%2) /* ty=Tensor[(1, 16, 8, 8), float16] */\n",
              " }\n",
              " ,\n",
              " def @main(%data_n_0: Tensor[(1, 16, 8, 8), float16] /* ty=Tensor[(1, 16, 8, 8), float16] */) {\n",
              "   %0 = nn.batch_flatten(%data_n_0) /* ty=Tensor[(1, 1024), float16] */;\n",
              "   nn.dense(%0, meta[relay.Constant][0] /* ty=Tensor[(1, 1024), float16] */, units=None) /* ty=Tensor[(1, 1), float16] */\n",
              " }\n",
              " ]"
            ]
          },
          "execution_count": 7,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "subgraphs"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 使用 cutlass target 构建子图"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "cutlass = tvm.target.Target(\n",
        "    {\n",
        "        \"kind\": \"cutlass\",\n",
        "        \"sm\": int(tvm.target.Target(\"cuda\").arch.split(\"_\")[1]),\n",
        "        \"use_3xtf32\": True,\n",
        "        \"split_k_slices\": [1],\n",
        "        \"profile_all_alignments\": False,\n",
        "        \"find_first_valid\": True,\n",
        "        \"use_multiprocessing\": True,\n",
        "        \"use_fast_math\": False,\n",
        "        \"tmp_dir\": \"./tmp\",\n",
        "    },\n",
        "    host=tvm.target.Target(\"llvm\"),\n",
        ")\n",
        "\n",
        "\n",
        "def cutlass_build(mod, target, params=None, target_host=None, mod_name=\"default\"):\n",
        "    target = [target, cutlass]\n",
        "    lib = relay.build_module.build(\n",
        "        mod, target=target, params=params, target_host=target_host, mod_name=mod_name\n",
        "    )\n",
        "    return lib"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 使用管道执行器在管道中运行两个子图\n",
        "\n",
        "在 cmake 中将 `USE_PIPELINE_EXECUTOR` 设置为 `ON`，并将 `USE_CUTLASS` 设置为 `ON`。"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 9,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "from tvm.contrib import graph_executor, pipeline_executor, pipeline_executor_build"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "- 创建子图管道配置。\n",
        "- 将子图模块与目标关联。\n",
        "- 使用 CUTLASS BYOC 构建第二个子图模块。"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 10,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "mod0, mod1 = subgraphs[0], subgraphs[1]\n",
        "# Use cutlass as the codegen.\n",
        "mod1 = partition_for_cutlass(mod1)"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "获取管道执行器配置对象。"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 11,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "pipe_config = pipeline_executor_build.PipelineConfig()"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "设置子图模块的编译目标。"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 12,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "pipe_config[mod0].target = \"llvm\"\n",
        "pipe_config[mod0].dev = tvm.cpu(0)"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "设置第二个子图模块的编译目标为cuda。"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "pipe_config[mod1].target = \"cuda\"\n",
        "pipe_config[mod1].dev = tvm.device(\"cuda\", 0)\n",
        "pipe_config[mod1].build_func = cutlass_build\n",
        "pipe_config[mod1].export_cc = \"nvcc\"\n",
        "# 通过连接子图模块创建管道。\n",
        "# 全局输入将 forwarded 到第一个名为 mod0 的模块的输入接口\n",
        "pipe_config[\"input\"][\"data\"].connect(pipe_config[mod0][\"input\"][\"data\"])\n",
        "# mod0 的第一个输出将被转发到 mod1 的输入接口\n",
        "pipe_config[mod0][\"output\"][0].connect(pipe_config[mod1][\"input\"][\"data_n_0\"])\n",
        "# mod1 的第一个输出将是第一个全局输出。\n",
        "pipe_config[mod1][\"output\"][0].connect(pipe_config[\"output\"][0])"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "管道配置如下。\n",
        "\n",
        "```\n",
        "\"\"\"\n",
        "print(pipe_config)\n",
        " Inputs\n",
        "  |data: mod0:data\n",
        "\n",
        " output\n",
        "  |output(0) : mod1.output(0)\n",
        "\n",
        " connections\n",
        "  |mod0.output(0)-> mod1.data_n_0\n",
        "\"\"\"\n",
        "```"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 构建管道执行器"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "with tvm.transform.PassContext(opt_level=3):\n",
        "    pipeline_mod_factory = pipeline_executor_build.build(pipe_config)"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "将参数配置导出为文件。"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "directory_path = tvm.contrib.utils.tempdir().temp_dir\n",
        "os.makedirs(directory_path, exist_ok=True)\n",
        "config_file_name = pipeline_mod_factory.export_library(directory_path)"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 使用 load 函数创建并初始化 PipelineModule"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "pipeline_module = pipeline_executor.PipelineModule.load_library(config_file_name)"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 运行 pipeline executor\n",
        "\n",
        "\n",
        "分配输入数据。"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "data = np.random.uniform(-1, 1, size=data_shape).astype(\"float16\")\n",
        "pipeline_module.set_input(\"data\", tvm.nd.array(data))"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "在管道模式下运行两个子图以异步获取输出或同步。在下面的示例中，它是同步的。"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "pipeline_module.run()\n",
        "outputs = pipeline_module.get_output()"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 使用 graph_executor 进行验证\n",
        "\n",
        "使用 graph_executor 按顺序运行这两个子图以获得输出。"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "target = \"llvm\"\n",
        "dev0 = tvm.device(target, 0)\n",
        "lib0 = relay.build_module.build(mod0, target, params=params)\n",
        "module0 = runtime.GraphModule(lib0[\"default\"](dev0))\n",
        "cuda = tvm.target.Target(\"cuda\", host=tvm.target.Target(\"llvm\"))\n",
        "lib1 = relay.build_module.build(mod1, [cuda, cutlass], params=params)\n",
        "lib1 = finalize_modules(lib1, \"compile.so\", \"./tmp\")\n",
        "\n",
        "dev1 = tvm.device(\"cuda\", 0)\n",
        "\n",
        "module1 = runtime.GraphModule(lib1[\"default\"](dev1))\n",
        "\n",
        "module0.set_input(\"data\", data)\n",
        "module0.run()\n",
        "out_shape = (1, 16, img_size, img_size)\n",
        "out = module0.get_output(0, tvm.nd.empty(out_shape, \"float16\"))\n",
        "module1.set_input(\"data_n_0\", out)\n",
        "module1.run()\n",
        "out_shape = (1, 1)\n",
        "out = module1.get_output(0, tvm.nd.empty(out_shape, \"float16\"))"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "验证结果："
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "np.testing.assert_allclose(outputs[0].numpy(), out.numpy())"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.10.11"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
